import torch
import torch.nn as nn
import torch.nn.functional as F
from utils.masking import TriangularCausalMask, ProbMask
from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer, ConvLayer
from layers.SelfAttention_Family import FullAttention, ProbAttention, AttentionLayer
from layers.Embed import DataEmbedding,DataEmbedding_only_timemodel,DataEmbedding_timemodel,DataEmbedding_wo_pos,DataEmbedding_wo_temp,DataEmbedding_wo_pos_temp
import numpy as np
import torch.fft


class moving_avg(nn.Module):
    """
    Moving average block to highlight the trend of time series.

    The series is padded along the time axis by repeating its first and last
    steps so that, with ``stride=1``, the smoothed output has the same length
    as the input for both odd and even kernel sizes.

    Args:
        kernel_size: width of the averaging window.
        stride: stride of the underlying ``nn.AvgPool1d``.
    """
    def __init__(self, kernel_size, stride):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        """
        Smooth ``x`` along dim 1.

        ``x`` is indexed as [batch, time, channel]: dim 1 is padded and, after
        the permute, pooled by ``AvgPool1d``.
        """
        # A window of size k needs k - 1 padded samples in total to keep the
        # length.  For odd k this is the symmetric (k - 1) // 2 on each side;
        # for even k the one extra sample goes to the front, otherwise the
        # output would be one step shorter than the input.
        pad_total = self.kernel_size - 1
        pad_end = pad_total // 2
        pad_front = pad_total - pad_end
        front = x[:, 0:1, :].repeat(1, pad_front, 1)
        end = x[:, -1:, :].repeat(1, pad_end, 1)
        x = torch.cat([front, x, end], dim=1)
        # AvgPool1d pools over the last axis, so move time there and back.
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x


class series_decomp(nn.Module):
    """
    Series decomposition block.

    Splits the input into a smooth component (a stride-1 moving average of
    width ``kernel_size``) and the remainder left after subtracting it.
    """
    def __init__(self, kernel_size):
        super().__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):
        """Return ``(residual, moving_mean)`` where ``residual = x - moving_mean``."""
        trend = self.moving_avg(x)
        return x - trend, trend


class FeedForwardNetwork(nn.Module):
    """
    Two-layer feed-forward network: ``hidden_size -> filter_size -> hidden_size``.

    Both linear layers are re-initialised with Xavier-uniform weights and
    zero biases.

    Args:
        hidden_size: input and output feature width.
        filter_size: width of the intermediate layer.
        dropout_rate: probability for the ``dropout`` submodule.
    """
    def __init__(self, hidden_size, filter_size, dropout_rate=0.1):
        super().__init__()

        self.layer1 = nn.Linear(hidden_size, filter_size)
        # NOTE: despite the attribute name, the activation is a sigmoid.
        self.relu = nn.Sigmoid()

        # Constructed but not applied in forward().
        self.dropout = nn.Dropout(dropout_rate)
        self.layer2 = nn.Linear(filter_size, hidden_size)

        for layer in (self.layer1, self.layer2):
            self.initialize_weight(layer)

    def forward(self, x):
        """Apply ``layer2(activation(layer1(x)))`` over the last axis of ``x``."""
        hidden = self.relu(self.layer1(x))
        return self.layer2(hidden)

    def initialize_weight(self, x):
        """Xavier-uniform init for ``x.weight``; zero ``x.bias`` when present."""
        nn.init.xavier_uniform_(x.weight)
        if x.bias is None:
            return
        nn.init.constant_(x.bias, 0)

            
def FFT_for_Period(x, k=2):
    """
    Estimate the ``k`` dominant periods of ``x`` from its amplitude spectrum.

    Args:
        x: real tensor indexed as [B, T, C]; the FFT is taken along dim 1.
        k: number of frequency bins to select.  ``torch.topk`` raises if
            ``k`` exceeds the number of rFFT bins.

    Returns:
        period: NumPy integer array of length ``k`` holding
            ``T // frequency_index`` for each selected bin.
        weight: tensor [B, k] with the channel-averaged amplitude of every
            sample at the selected bins.
    """
    # [B, T, C]
    xf = torch.fft.rfft(x, dim=1)
    amplitude = abs(xf)
    # find period by amplitudes
    frequency_list = amplitude.mean(0).mean(-1)
    # Amplitudes are non-negative, so a negative score keeps the DC bin from
    # being selected while any other bin is still available (merely zeroing
    # it lets it tie with empty bins, e.g. for a constant input).
    frequency_list[0] = -1
    _, top_list = torch.topk(frequency_list, k)
    top_list = top_list.detach().cpu().numpy()
    # If the DC bin is selected anyway (k equals the number of bins), map it
    # to a period of the full length instead of dividing by zero.
    period = x.shape[1] // top_list.clip(min=1)
    return period, amplitude.mean(-1)[:, top_list]


class nconv(nn.Module):
    """
    Graph propagation step.

    Mixes the node axis of ``x`` through an adjacency tensor ``A`` that holds
    one matrix per position of the last axis of ``x``.
    """
    def __init__(self):
        super(nconv, self).__init__()

    def forward(self, x, A):
        """
        Args:
            x: tensor indexed as [b, t, n, p].
            A: tensor indexed as [p, n, v]; ``A[p]`` maps source nodes ``n``
                to target nodes ``v``.

        Returns:
            Contiguous tensor [b, t, v, p] with
            ``out[b, t, v, p] = sum_n x[b, t, n, p] * A[p, n, v]``.
        """
        # The source-node label must be shared between both operands so that
        # it is contracted.  With unrelated labels the einsum degenerates to
        # (sum of x over nodes) * (sum of A over rows), which is not a graph
        # propagation.
        x = torch.einsum('btnp,pnv->btvp', x, A)
        return x.contiguous()


class linear(nn.Module):
    """
    Channel-wise linear layer implemented as a 1x1 ``Conv2d`` with bias.

    Args:
        c_in: number of input channels (dim 1 of the input).
        c_out: number of output channels.
    """
    def __init__(self, c_in, c_out):
        super().__init__()
        self.mlp = nn.Conv2d(
            c_in,
            c_out,
            kernel_size=(1, 1),
            stride=(1, 1),
            padding=(0, 0),
            bias=True,
        )

    def forward(self, x):
        """Project the channel axis of a 4-D input from ``c_in`` to ``c_out``."""
        projected = self.mlp(x)
        return projected


class gcn(nn.Module):
    """
    Multi-hop graph convolution.

    For every adjacency in ``support`` the input is propagated repeatedly
    through ``nconv``; the input and all propagated tensors are concatenated
    along dim 1 and mixed by a 1x1 convolution.

    Args:
        c_in: size of dim 1 of the input.
        c_out: size of dim 1 after the 1x1 convolution.
        dropout: dropout probability applied to the convolution output.
        support_len: number of adjacency tensors expected in ``support``.
        order: number of propagation hops per adjacency.
    """
    def __init__(self, c_in, c_out, dropout, support_len=3, order=2):
        super().__init__()
        self.nconv = nconv()
        # One slice for the raw input plus `order` hops for each support.
        self.mlp = linear((order * support_len + 1) * c_in, c_out)
        self.dropout = dropout
        self.order = order

    def forward(self, x, support):
        """
        Args:
            x: 4-D tensor accepted by ``nconv``.
            support: iterable of adjacency tensors.

        Returns:
            The mixed features with dim 1 moved to the last axis
            (``permute(0, 2, 3, 1)``).
        """
        features = [x]
        # At least one hop is always taken, even when order < 1.
        n_hops = max(self.order, 1)
        for adj in support:
            hop = x
            for _ in range(n_hops):
                hop = self.nconv(hop, adj)
                features.append(hop)

        stacked = torch.cat(features, dim=1)
        mixed = self.mlp(stacked)
        dropped = F.dropout(mixed, self.dropout, training=self.training)
        return dropped.permute(0, 2, 3, 1)


class Model(nn.Module):
    """
    Multi-period convolutional forecaster.

    Every input variable is processed independently: the last observed value
    is subtracted, the series passes through one branch of 1-D convolutions
    per configured (period, conv_kernel) pair, the branch outputs are
    concatenated, gated by a learned per-variable sigmoid weight
    (``nodevec``), projected linearly to ``pred_len`` steps, and the last
    observed value is added back.

    Expected ``configs`` attributes: ``seq_len``, ``pred_len``, ``enc_in``,
    ``c_out``, ``d_model``, ``n_model``, ``t_model``, ``conv_kernel``,
    ``period``, ``output_attention``, ``d_layers``, ``embed_type`` and, for
    ``embed_type`` in 0..4, the embedding settings ``dec_in``, ``embed``,
    ``freq`` and ``dropout``.
    """
    def __init__(self, configs):
        super(Model, self).__init__()
        self.seq_len = configs.seq_len
        self.pred_len = configs.pred_len
        self.enc_in = configs.enc_in
        self.c_out = configs.c_out
        self.d_model = configs.d_model
        self.n_model = configs.n_model
        self.t_model = configs.t_model
        self.conv_kernel = configs.conv_kernel   # e.g. [1, 3, 4]
        self.sample_kernel = [2, 1, 1]
        self.period = configs.period   # e.g. [24, 56, 48]
        self.output_attention = configs.output_attention
        self.d_layers = configs.d_layers
        self.dropout = 0.05
        self.k = 2   # configs.top_k
        self.order = 2

        # Embedding (constructed for configuration compatibility; forward()
        # does not call these modules).
        if configs.embed_type == 0:
            self.enc_embedding = DataEmbedding(configs.enc_in, configs.d_model, configs.embed, configs.freq,
                                            configs.dropout)
            self.dec_embedding = DataEmbedding(configs.dec_in, configs.d_model, configs.embed, configs.freq,
                                           configs.dropout)
        elif configs.embed_type == 1:
            self.enc_embedding = DataEmbedding(configs.enc_in, configs.t_model, configs.embed, configs.freq,
                                                    configs.dropout)
        elif configs.embed_type == 2:
            self.enc_embedding = DataEmbedding_wo_pos(configs.enc_in, configs.d_model, configs.embed, configs.freq,
                                                    configs.dropout)
            self.dec_embedding = DataEmbedding_wo_pos(configs.dec_in, configs.d_model, configs.embed, configs.freq,
                                                    configs.dropout)

        elif configs.embed_type == 3:
            self.enc_embedding = DataEmbedding_wo_temp(configs.enc_in, configs.d_model, configs.embed, configs.freq,
                                                    configs.dropout)
            self.dec_embedding = DataEmbedding_wo_temp(configs.dec_in, configs.d_model, configs.embed, configs.freq,
                                                    configs.dropout)
        elif configs.embed_type == 4:
            self.enc_embedding = DataEmbedding_wo_pos_temp(configs.enc_in, configs.d_model, configs.embed, configs.freq,
                                                    configs.dropout)
            self.dec_embedding = DataEmbedding_wo_pos_temp(configs.dec_in, configs.d_model, configs.embed, configs.freq,
                                                    configs.dropout)

        # Total number of positions contributed by all branches.
        c = sum(self._segment_lengths())
        self.Linear = nn.Linear(c * self.t_model, self.pred_len)

        # Per-variable gate logits.  Registered as a plain parameter so it
        # follows Module.to()/.cuda() instead of being pinned to one GPU.
        self.nodevec = nn.Parameter(torch.randn(self.enc_in, c), requires_grad=True)

        self.conv1 = nn.ModuleList()
        self.conv2 = nn.ModuleList()
        self.sample_conv = nn.ModuleList()
        self.projection_sample = nn.ModuleList()
        self.projection_s = nn.ModuleList()

        # One branch per conv kernel:
        #   conv1       - stride-1 smoothing convolution,
        #   conv2       - down-sampling convolution (stride = kernel size),
        #   sample_conv - dilated convolution (dilation = period // kernel)
        #                 that expands the single channel to t_model channels.
        for i in range(len(self.conv_kernel)):
            self.conv1.append(nn.Conv1d(in_channels=1, out_channels=1,
                                    kernel_size=self.conv_kernel[i], padding=self.conv_kernel[i] // 2, stride=1))
            self.conv2.append(nn.Conv1d(in_channels=1, out_channels=1,
                                    kernel_size=self.conv_kernel[i], padding=self.conv_kernel[i] // 2, stride=self.conv_kernel[i]))
            self.sample_conv.append(nn.Conv1d(in_channels=1, out_channels=self.t_model,
                                    dilation=self.period[i] // self.conv_kernel[i], kernel_size=self.seq_len // self.period[i], padding=0, stride=1))

    @property
    def device(self):
        """Device the model parameters currently live on."""
        return self.nodevec.device

    def _segment_lengths(self):
        """Return ``period[i] // conv_kernel[i]`` for every branch ``i``."""
        return [self.period[i] // kernel for i, kernel in enumerate(self.conv_kernel)]

    def dgconstruct(self, time_embedding, source_embedding, target_embedding, core_embedding):
        """
        Build adjacency tensors from factor embeddings.

        Contracts ``core_embedding`` [i, j, k] with ``time_embedding`` [a, i],
        ``source_embedding`` [b, j] and ``target_embedding`` [c, k], then
        applies ReLU followed by a softmax over the last axis.

        Returns:
            Tensor [a, b, c] whose entries along the last axis sum to 1.
        """
        adp = torch.einsum('ai, ijk->ajk', time_embedding, core_embedding)
        adp = torch.einsum('bj, ajk->abk', source_embedding, adp)
        adp = torch.einsum('ck, abk->abc', target_embedding, adp)
        adp = F.softmax(F.relu(adp), dim=2)
        return adp

    def forward(self, x, x_mark_dec, y, y_mark_enc, x_fa):
        """
        Forecast ``pred_len`` steps for every input variable.

        Args:
            x: input series indexed as [B, seq_len, enc_in].
            x_mark_dec, y, y_mark_enc, x_fa: accepted for interface
                compatibility; not used by this method.

        Returns:
            Tensor [B, pred_len, enc_in].
        """
        # Remove the level of each series; it is added back after projection.
        seq_last = x[:, -1:, :].detach()
        x = x - seq_last
        x_enc = x.permute(0, 2, 1).unsqueeze(dim=-2).reshape(-1, 1, self.seq_len)   # [B*enc_in, 1, seq_len]

        segments = []
        for i, seg_len in enumerate(self._segment_lengths()):
            sample_out = self.conv1[i](x_enc)
            sample_out = self.conv2[i](sample_out)
            sample_out = self.sample_conv[i](sample_out)   # [B*enc_in, t_model, L_i]
            # Keep at most seg_len positions; shorter outputs are kept whole.
            segments.append(sample_out[:, :, :seg_len])
        n_sample = torch.cat(segments, dim=-1)   # [B*enc_in, t_model, c]
        c = n_sample.shape[-1]

        # NOTE(review): this is a reshape, not a permute, so the
        # (t_model, c) memory layout is reinterpreted as (c, t_model).
        # Kept as-is so existing checkpoints keep their meaning.
        dec_out = n_sample.reshape(-1, self.enc_in, c, self.t_model)

        # Apply a sigmoid to nodevec and multiply it onto the pattern,
        # broadcasting over the batch axis and the last axis.
        nodevec = torch.sigmoid(self.nodevec).unsqueeze(dim=0).unsqueeze(dim=-1)
        dec_out = dec_out * nodevec

        dec = self.Linear(dec_out.reshape(-1, self.enc_in, c * self.t_model)).permute(0, 2, 1)

        # Restore the level removed at the start (broadcast over pred_len).
        dec = dec + seq_last

        return dec